Authors: Mauro Venticinque, Angelo Schillaci, Daniele Tambone
GitHub project: Bank-Marketing
Date: 2025-03-27
Here we will write some information about the project.
datatable(head(train, 100), options = list(scrollX = TRUE))
str(train)
## 'data.frame': 32950 obs. of 22 variables:
## $ X : int 35248 39854 14530 27822 40199 21227 16836 39099 38565 38152 ...
## $ age : int 30 39 43 27 56 41 57 46 61 35 ...
## $ job : chr "blue-collar" "technician" "services" "student" ...
## $ marital : chr "married" "married" "single" "single" ...
## $ education : chr "professional.course" "university.degree" "high.school" "high.school" ...
## $ default : chr "no" "no" "no" "no" ...
## $ housing : chr "no" "yes" "no" "yes" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "cellular" "cellular" "cellular" "cellular" ...
## $ month : chr "may" "jun" "jul" "mar" ...
## $ day_of_week : chr "fri" "mon" "tue" "thu" ...
## $ duration : int 1357 713 1317 80 230 697 1441 679 106 234 ...
## $ campaign : int 4 2 4 4 2 2 2 1 2 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 1 0 0 0 1 0 0 0 1 0 ...
## $ poutcome : chr "failure" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num -1.8 -1.7 1.4 -1.8 -1.7 1.4 1.4 -3 -3.4 -3.4 ...
## $ cons.price.idx: num 92.9 94.1 93.9 92.8 94.2 ...
## $ cons.conf.idx : num -46.2 -39.8 -42.7 -50 -40.3 -36.1 -42.7 -33 -26.9 -29.8 ...
## $ euribor3m : num 1.25 0.72 4.96 1.65 0.87 ...
## $ nr.employed : num 5099 4992 5228 5099 4992 ...
## $ subscribed : chr "yes" "yes" "yes" "yes" ...
attach(train)
X (Integer): id of customerage (Integer): age of the customerjob (Categorical): occupationmarital (Categorical): marital statuseducation (Categorical): education leveldefault (Binary): has credit in default?housing (Binary): has housing loan?loan (Binary): has personal loan?contact (Categorical): contact communication typemonth (Categorical): last contact month of yearday_of_week (Integer): last contact day of the
weekduration (Integer): last contact duration, in seconds
(numeric). Important note: this attribute highly affects the output
target (e.g., if duration=0 then y=‘no’). Yet, the duration is not known
before a call is performed. Also, after the end of the call y is
obviously known. Thus, this input should only be included for benchmark
purposes and should be discarded if the intention is to have a realistic
predictive modelcampaign (Integer): number of contacts performed during
this campaign and for this client (numeric, includes last contact)pdays (Integer): number of days that passed by after
the client was last contacted from a previous campaign (numeric; -1
means client was not previously contacted)previous (Integer): number of contacts performed before
this campaign and for this clientpoutcome (Categorical): outcome of the previous
marketing campaign (categorical: ‘failure’,‘nonexistent’,‘success’)subscribed (Binary): has the client subscribed a term
deposit?Source: UCI Machine Learning Repository
vis_dat(train)
skim(train)
| Name | train |
| Number of rows | 32950 |
| Number of columns | 22 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| numeric | 11 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| job | 0 | 1 | 6 | 13 | 0 | 12 | 0 |
| marital | 0 | 1 | 6 | 8 | 0 | 4 | 0 |
| education | 0 | 1 | 7 | 19 | 0 | 8 | 0 |
| default | 0 | 1 | 2 | 7 | 0 | 3 | 0 |
| housing | 0 | 1 | 2 | 7 | 0 | 3 | 0 |
| loan | 0 | 1 | 2 | 7 | 0 | 3 | 0 |
| contact | 0 | 1 | 8 | 9 | 0 | 2 | 0 |
| month | 0 | 1 | 3 | 3 | 0 | 10 | 0 |
| day_of_week | 0 | 1 | 3 | 3 | 0 | 5 | 0 |
| poutcome | 0 | 1 | 7 | 11 | 0 | 3 | 0 |
| subscribed | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| X | 0 | 1 | 20622.42 | 11882.00 | 1.00 | 10346.50 | 20629.50 | 30883.75 | 41188.00 | ▇▇▇▇▇ |
| age | 0 | 1 | 40.04 | 10.45 | 17.00 | 32.00 | 38.00 | 47.00 | 98.00 | ▅▇▃▁▁ |
| duration | 0 | 1 | 258.66 | 260.83 | 0.00 | 102.00 | 180.00 | 318.00 | 4918.00 | ▇▁▁▁▁ |
| campaign | 0 | 1 | 2.57 | 2.77 | 1.00 | 1.00 | 2.00 | 3.00 | 43.00 | ▇▁▁▁▁ |
| pdays | 0 | 1 | 961.90 | 188.33 | 0.00 | 999.00 | 999.00 | 999.00 | 999.00 | ▁▁▁▁▇ |
| previous | 0 | 1 | 0.17 | 0.49 | 0.00 | 0.00 | 0.00 | 0.00 | 7.00 | ▇▁▁▁▁ |
| emp.var.rate | 0 | 1 | 0.08 | 1.57 | -3.40 | -1.80 | 1.10 | 1.40 | 1.40 | ▁▃▁▁▇ |
| cons.price.idx | 0 | 1 | 93.57 | 0.58 | 92.20 | 93.08 | 93.75 | 93.99 | 94.77 | ▁▆▃▇▂ |
| cons.conf.idx | 0 | 1 | -40.49 | 4.63 | -50.80 | -42.70 | -41.80 | -36.40 | -26.90 | ▅▇▁▇▁ |
| euribor3m | 0 | 1 | 3.62 | 1.74 | 0.63 | 1.34 | 4.86 | 4.96 | 5.04 | ▅▁▁▁▇ |
| nr.employed | 0 | 1 | 5167.01 | 72.31 | 4963.60 | 5099.10 | 5191.00 | 5228.10 | 5228.10 | ▁▁▃▁▇ |
plot_ly(train, x = subscribed, type = 'histogram')
corrplot(cor(train[, c("X", "age", "duration", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed")]), method="pie")
plot_ly(train, x = job, y = age, type = 'box', color = job)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
plot_ly(train, x = education, y = age, type = 'box', color = education)
eduResp <- ggplot(train, aes(x = education, fill = factor(subscribed))) +
geom_bar(position = "fill") +
coord_flip() +
ylab("Proportion") +
scale_fill_discrete(name = "Subscribed") +
xlab("Education") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
eduFreq <- ggplot(as.data.frame(table(education)/sum(table(education))*100), aes(x = reorder(education, Freq), y = Freq)) +
geom_bar(stat = "identity", color = "gray", fill = "steelblue", alpha=0.9) +
coord_flip() +
labs(title = "Education", x = "Education Level", y = "Count") +
theme_minimal()
eduFreq / eduResp
ggplot(train, aes(x = poutcome, fill = factor(subscribed))) +
geom_bar(position = "fill") +
ylab("Proportion") +
scale_fill_discrete(name = "Subscribed") +
xlab("Outcome of previous campaign") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal() +
geom_text(
data = transform(as.data.frame(table(train$poutcome)),
poutcome = Var1,
label = paste0(round(100 * Freq / sum(Freq), 1), "%")),
aes(x = poutcome, y = 1.05, label = label),
inherit.aes = FALSE
)
ggplot(train, aes(age)) + geom_histogram(binwidth=4,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)
ggplot(train, aes(x = job, fill = factor(subscribed))) +
geom_bar(position = "fill") +
ylab("Proportion") +
scale_fill_discrete(name = "Subscribed") +
xlab("Job") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(
data = transform(as.data.frame(table(train$job)),
job = Var1,
label = paste0(round(100 * Freq / sum(Freq), 1), "%")),
aes(x = job, y = 1.05, label = label),
inherit.aes = FALSE
)
ggplot(train, aes(cons.price.idx)) + geom_histogram(binwidth=2,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)
ggplot(train, aes(cons.conf.idx)) + geom_histogram(binwidth=3,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)
ggplot(train, aes(euribor3m)) + geom_histogram(binwidth=3,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)
train$emp_cat <- ifelse(train$emp.var.rate < 0, "Negative", "Positive or Zero")
ggplot(train, aes(x = emp_cat, fill = factor(subscribed))) +
geom_bar(position = "fill") +
ylab("Proportion") +
scale_fill_discrete(name = "Subscribed") +
xlab("Employment Variation (±)") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal() +
geom_text(
data = {
df <- as.data.frame(table(train$emp_cat))
names(df) <- c("emp_cat", "Freq")
df$label <- paste0(round(100 * df$Freq / sum(df$Freq), 1), "%")
df$y <- 1.05
df
},
aes(x = emp_cat, y = y, label = label),
inherit.aes = FALSE
)
ggplot(train, aes(x = default, fill = factor(subscribed))) +
geom_bar(position = "fill") +
ylab("Proportion") +
scale_fill_discrete(name = "Subscribed") +
xlab("Default") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal() +
geom_text(
data = transform(as.data.frame(table(train$default)),
default = Var1,
label = paste0(round(100 * Freq / sum(Freq), 1), "%")),
aes(x = default, y = 1.05, label = label),
inherit.aes = FALSE
)
ggpairs(train[, c("age", "duration", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed")], columns = 1:10,
lower = list(continuous = wrap("points", alpha = 0.5, color = "darkred", size=0.5)),
title='Scatterplot', axisLabels='none')
1.1.3 Social and economic context attributes
emp.var.rate(Integer): employment variation rate - quarterly indicatorcons.price.idx(Integer): consumer price index - monthly indicatorcons.conf.idx(Integer): consumer confidence index - monthly indicatoreuribor3m(Integer): euribor 3 month rate - daily indicatornr.employed(Integer): number of employees - quarterly indicator